# Load the full training data and carve out a stratified hold-out set.
full <- readr::read_csv("train.csv")

# Fixed seed so the split is reproducible.
set.seed(314)

# Draw 20% of the rows within each target class as the hold-out set.
test <- full %>%
  group_by(target) %>%
  sample_frac(0.2) %>%
  ungroup()

# Every row not drawn into `test` remains in `train`.
train <- full %>%
  filter(!(id %in% test$id))
Técnicas de redução de dimensionalidade serão aplicadas a fim de entender de maneira visual como os dados estão distribuídos.
# Sub-sample 10% of the training rows (stratified by target) so the
# dimensionality-reduction visualisations below stay tractable.
set.seed(314)
to_pca <- train %>%
  group_by(target) %>%
  sample_frac(.1) %>%
  ungroup()

# Spearman rank correlation between every pair of features, rendered as a
# clustered interactive heatmap.
to_pca %>%
  select(-id, -target) %>%
  cor(method = 'spearman') %>%
  heatmaply::heatmaply_cor(
    xlab = "Features",
    ylab = "Features",
    k_col = 4,
    k_row = 8
  )
A correlação é inexistente entre as features deste dataset, o que inviabiliza o uso de alguns recursos de redução de dimensionalidade.
# Principal component analysis on the standardized features
# (scale.unit = TRUE z-scores every column before the decomposition).
res.pca <- to_pca %>%
  select(-id, -target) %>%
  FactoMineR::PCA(graph = FALSE, scale.unit = TRUE)
# get_eigenvalue(res.pca)

# Scree plot: percentage of variance explained by the first 50 components.
factoextra::fviz_eig(res.pca, addlabels = TRUE, ncp = 50)
Este algoritmo é uma ótima ferramenta para ajudar na compreensão de dados de alta dimensionalidade, porém não é tão útil para aplicar a redução de dimensionalidade para treinamento de modelos de machine learning.
# 2-D t-SNE embedding of the sub-sample (no PCA pre-processing step).
tsne_tps <- to_pca %>%
  select(-id, -target) %>%
  Rtsne::Rtsne(
    dims = 2,
    perplexity = 30,
    PCA = FALSE,
    verbose = TRUE,
    max_iter = 500,
    check_duplicates = FALSE
  )

# Cache the (slow) embedding on disk, then reload it.
saveRDS(tsne_tps, "tsne_tps.rds")
tsne_tps <- readRDS("tsne_tps.rds")

# Scatter the two t-SNE coordinates, coloured by target class.
as_tibble(tsne_tps$Y) %>%
  bind_cols(select(to_pca, target)) %>%
  ggplot(aes(x = V1, y = V2, col = target)) +
  geom_point() +
  labs(title = "t-SNE")
# UMAP embedding of the same sub-sample used for PCA/t-SNE.
# BUG FIX: the original called `select(to_tsne, ...)` — an object that is
# never defined (the sub-sample is named `to_pca`) — and then bound
# `train$target` to the layout, whose row count does not match the data
# the embedding was fitted on. Both references now use `to_pca`.
umap_tps <- umap::umap(select(to_pca, -id, -target))

# Cache the embedding on disk, then reload it.
saveRDS(umap_tps, "umap_tps.rds")
umap_tps <- readRDS("umap_tps.rds")
# predict(umap_tps, select(test, -id, -target))

# `layout` holds the 2-D coordinates, one row per row of `to_pca`.
as_tibble(umap_tps$layout) %>%
  bind_cols(select(to_pca, target)) %>%
  ggplot(aes(x = V1, y = V2, col = target)) +
  geom_point() +
  labs(title = "UMAP")
# Autoencoder training hyper-parameters.
batch_size <- 128
epochs <- 500

# Standardize every feature (zero mean, unit variance) and convert to the
# numeric matrix keras expects. NOTE(review): this uses `full`, i.e. all
# rows including the hold-out split — confirm that is intentional.
x_train <- full %>%
  select(-id, -target) %>%
  mutate_all(scale) %>%
  as.matrix()
# Symmetric fully-connected autoencoder: 564 -> 256 -> 2 -> 256 -> 564,
# with a named 2-unit bottleneck and a linear reconstruction output.
model <- keras_model_sequential()
model %>%
  layer_dense(units = 564, activation = "relu",
              input_shape = ncol(x_train)) %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 2, activation = "relu", name = "bottleneck") %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 564, activation = "relu") %>%
  layer_dense(units = ncol(x_train), activation = "linear")
# Architecture summary of the autoencoder. The `##` lines below are the
# console output captured from a previous run (standard knitted-Rmd style);
# the final layer width (50) reflects ncol(x_train) for that run.
summary(model)
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_4 (Dense) (None, 564) 28764
## ________________________________________________________________________________
## dense_3 (Dense) (None, 256) 144640
## ________________________________________________________________________________
## bottleneck (Dense) (None, 2) 514
## ________________________________________________________________________________
## dense_2 (Dense) (None, 256) 768
## ________________________________________________________________________________
## dense_1 (Dense) (None, 564) 144948
## ________________________________________________________________________________
## dense (Dense) (None, 50) 28250
## ================================================================================
## Total params: 347,884
## Trainable params: 347,884
## Non-trainable params: 0
## ________________________________________________________________________________
# Mean-squared-error reconstruction loss, optimised with Adam.
# NOTE(review): "accuracy" is not meaningful for a continuous
# reconstruction target; it is tracked here only for the curves below.
# `lr` is the legacy spelling of `learning_rate` in older keras releases.
model %>% compile(
  metrics = c("accuracy"),
  loss = "mean_squared_error",
  optimizer = optimizer_adam(lr = 0.001)
)
# Train the autoencoder to reconstruct its own (scaled) input, with early
# stopping on the validation reconstruction loss. `tictoc` brackets the
# wall-clock time of the fit.
tictoc::tic()
history <- model %>%
fit(x = x_train, y = x_train,
epochs = epochs,
validation_split =.2,
view_metrics = TRUE,
callbacks=list(callback_early_stopping(
monitor = "val_loss",
min_delta = 0.01,
patience = 50,
restore_best_weights = TRUE
)),
verbose=2)
tictoc::toc()
# Persist the fitted model and its history, then immediately reload them —
# the knitted-report pattern that lets the chunk run with training skipped.
model %>% save_model_tf("model_ae")
saveRDS(history, "history_ae.rds")
model <- load_model_tf("model_ae")
history <- readRDS("history_ae.rds")
# Printing `history` records the final-epoch metrics below.
history
##
## Final epoch (plot to see history):
## loss: 0.656
## accuracy: 0.4906
## val_loss: 0.6757
## val_accuracy: 0.4726
# (Optional padding of the metrics out to `epochs` rows when early stopping
#  ended training early — kept from the original for reference; the same
#  pattern is used, uncommented, for the denoising autoencoder below.)
# gap = epochs - nrow(as_metrics_df(history))
# gap_tbl = tibble(loss= rep(NA_real_, gap),
# accuracy= rep(NA_real_, gap),
# val_loss= rep(NA_real_, gap),
# val_accuracy= rep(NA_real_, gap)
# )
# bind_rows(as_metrics_df(history), gap_tbl ) %>%

# Per-epoch training curves, faceted into accuracy and loss panels.
# Modernised: row_number() instead of 1:nrow(.), pivot_longer() instead of
# the superseded gather(), se = FALSE instead of the unsafe F alias.
as_metrics_df(history) %>%
  mutate(epochs = row_number()) %>%
  pivot_longer(-epochs, names_to = "key", values_to = "val") %>%
  mutate(metric = case_when(
    str_detect(key, "accuracy") ~ "accuracy",
    str_detect(key, "loss") ~ "log_loss"
  )) %>%
  ggplot(aes(x = epochs, y = val, col = key)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  theme_bw() +
  facet_wrap(~metric, scales = "free_y")
# Evaluate reconstruction error on the data the model was fitted to.
mse.ae2 <- evaluate(model, x_train, x_train)
mse.ae2
## loss accuracy
## 0.6329905 0.5052900

# Sub-model that stops at the 2-unit "bottleneck" layer: its activations
# give a learned 2-D embedding for every row.
intermediate_layer_model <- keras_model(
  inputs = model$input,
  outputs = get_layer(model, "bottleneck")$output
)
intermediate_output <- predict(intermediate_layer_model, x_train)

# Scatter the bottleneck coordinates, coloured by target class.
ggplot(
  data.frame(
    PC1 = intermediate_output[, 1],
    PC2 = intermediate_output[, 2]
  ),
  aes(x = PC1, y = PC2, col = full$target)
) +
  geom_point(alpha = .5)

# train_dae <- predict(model, x_train)
# train_dae %>% as_tibble()
# Reproducible 80/20 row assignment for the denoising autoencoder.
# FIX: the original sampled without a seed, making this the only
# non-reproducible split in the script (cf. set.seed(314) at the two
# earlier sampling steps); also TRUE instead of the reassignable T alias.
set.seed(314)
samples <- sample(1:2, size = nrow(full), replace = TRUE, prob = c(0.8, 0.2))

# set training data
x_train <- full %>%
  filter(samples == 1) %>%
  select(-id, -target) %>%
  mutate_all(scale) %>%
  as.matrix()

# held-out rows used as the validation stream for the generator below
# NOTE(review): each split is scaled with its own mean/sd; consider
# applying the training-set statistics to the validation split instead.
x_val <- full %>%
  filter(samples == 2) %>%
  select(-id, -target) %>%
  mutate_all(scale) %>%
  as.matrix()
# Denoising autoencoder with the same symmetric topology as before:
# 564 -> 256 -> 2 -> 256 -> 564, named 2-unit bottleneck, linear output.
model <- keras_model_sequential()
model %>%
  layer_dense(units = 564, activation = "relu",
              input_shape = ncol(x_train)) %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 2, activation = "relu", name = "bottleneck") %>%
  layer_dense(units = 256, activation = "relu") %>%
  layer_dense(units = 564, activation = "relu") %>%
  layer_dense(units = ncol(x_train), activation = "linear")
# Architecture summary of the denoising autoencoder. The `##` lines are the
# console output captured from a previous run.
summary(model)
## Model: "sequential_1"
## ________________________________________________________________________________
## Layer (type) Output Shape Param #
## ================================================================================
## dense_9 (Dense) (None, 564) 28764
## ________________________________________________________________________________
## dense_8 (Dense) (None, 256) 144640
## ________________________________________________________________________________
## bottleneck (Dense) (None, 2) 514
## ________________________________________________________________________________
## dense_7 (Dense) (None, 256) 768
## ________________________________________________________________________________
## dense_6 (Dense) (None, 564) 144948
## ________________________________________________________________________________
## dense_5 (Dense) (None, 50) 28250
## ================================================================================
## Total params: 347,884
## Trainable params: 347,884
## Non-trainable params: 0
## ________________________________________________________________________________
# Same training objective as the plain autoencoder: MSE reconstruction
# loss with Adam. NOTE(review): "accuracy" is not meaningful for a
# continuous target; `lr` is the legacy spelling of `learning_rate`.
model %>% compile(
  metrics = c("accuracy"),
  loss = "mean_squared_error",
  optimizer = optimizer_adam(lr = 0.001)
)
#' Factory for a keras-style "swap noise" batch generator used to train a
#' denoising autoencoder. Each call of the returned closure draws a random
#' batch and, independently per column, replaces roughly `perc` of the
#' batch's values with values taken from other rows of the same column.
#'
#' @param data matrix (or data frame) of numeric features, one row per sample.
#' @param batch_size rows drawn without replacement per call.
#' @param perc expected fraction of rows swapped per column.
#' @return A zero-argument function returning list(noisy_batch, clean_batch),
#'   both as matrices — the (x, y) pair keras fit() expects.
batch_generator <- function(data, batch_size, perc = 0.35)
{
  function() {
    ids <- sample.int(nrow(data), batch_size, replace = FALSE)
    clean <- data[ids, , drop = FALSE]  # drop = FALSE keeps batch_size = 1 a matrix
    noisy <- clean
    for (j in seq_len(ncol(noisy))) {
      to_swap <- sample(c(TRUE, FALSE), size = nrow(noisy), replace = TRUE,
                        prob = c(perc, 1 - perc))
      vals <- noisy[to_swap, j]
      # Permute explicitly by index: `sample(x)` on a length-1 numeric x
      # samples from 1:x instead of permuting — the base-R footgun that the
      # original code "handled" with a broken tryCatch (its `data = data`
      # handler was a no-op, so corrupted batches slipped through).
      noisy[to_swap, j] <- vals[sample.int(length(vals))]
    }
    list(
      as.matrix(noisy),
      as.matrix(clean)
    )
  }
}
# Generator-driven training: fixed step counts per epoch rather than the
# exact dataset-derived values (originals kept in the trailing comments).
steps_per_epoch <- 600 #floor(nrow(x_train)/batch_size)
validation_steps <- 100 #floor(nrow(x_val)/batch_size)
# Fit the denoising autoencoder on noisy/clean pairs streamed from
# batch_generator(), with early stopping on validation loss.
tictoc::tic()
history <- model %>%
fit(batch_generator(x_train, batch_size),
steps_per_epoch = steps_per_epoch,
validation_steps = validation_steps,
validation_data = batch_generator(x_val, batch_size),
epochs = epochs,
view_metrics = TRUE,
callbacks=list(callback_early_stopping(
monitor = "val_loss",
min_delta = 0.01,
patience = 50,
restore_best_weights = TRUE
)),
verbose=2)
tictoc::toc()
# Persist the fitted model and its history, then reload the history.
# BUG FIX: a keras model wraps external (Python/TensorFlow) objects that
# saveRDS() cannot serialize, so the reloaded object would be unusable.
# Use save_model_tf(), matching how the first autoencoder is saved above.
model %>% save_model_tf("model_dae")
saveRDS(history, "history_dae.rds")
history <- readRDS("history_dae.rds")
# Printing `history` records the final-epoch metrics below.
history
##
## Final epoch (plot to see history):
## loss: 0.7782
## accuracy: 0.3716
## val_loss: 0.7931
## val_accuracy: 0.3698
# Pad the recorded metrics with NA rows out to the configured `epochs`, so
# the x-axis shows the full training horizon even when early stopping
# ended the run early. Modernised: `<-` assignment, row_number() instead
# of 1:nrow(.), pivot_longer() instead of superseded gather(), se = FALSE.
gap <- epochs - nrow(as_metrics_df(history))
gap_tbl <- tibble(
  loss = rep(NA_real_, gap),
  accuracy = rep(NA_real_, gap),
  val_loss = rep(NA_real_, gap),
  val_accuracy = rep(NA_real_, gap)
)
bind_rows(as_metrics_df(history), gap_tbl) %>%
  mutate(epochs = row_number()) %>%
  pivot_longer(-epochs, names_to = "key", values_to = "val") %>%
  mutate(metric = case_when(
    str_detect(key, "accuracy") ~ "accuracy",
    str_detect(key, "loss") ~ "log_loss"
  )) %>%
  ggplot(aes(x = epochs, y = val, col = key)) +
  geom_point() +
  geom_smooth(se = FALSE) +
  theme_bw() +
  facet_wrap(~metric, scales = "free_y")